In [1]:
    
import pandas as pd
import numpy as np
import datetime
from datetime import date
from dateutil.rrule import rrule, DAILY
from __future__ import division
import geoplotlib as glp
from geoplotlib.utils import BoundingBox, DataAccessObject
# Do not cap the number of columns pandas shows when displaying a DataFrame.
pd.set_option('display.max_columns', None)
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
    
In [2]:
    
# Load the weather observations and the collision records from CSV.
weather = pd.read_csv('datasets/weather_data_nyc_kjfk_clean2.csv')
incidents = pd.read_csv("datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv")

# Build a 'Year/Month/Day/Hour' string key that identifies the hour of each
# weather observation.
year_str, month_str, day_str, hour_str = (
    weather[part].astype('str') for part in ('Year', 'Month', 'Day', 'Hour')
)
weather['date'] = year_str + '/' + month_str + '/' + day_str + '/' + hour_str
    
    
Frequency of measured weather conditions from 7/1/2012 to 3/1/2016, on an hourly basis
In [3]:
    
# Count, for every weather condition, the number of measured hours in which
# that condition was observed.
#
# Several rows can share the same 'date' key (Year/Month/Day/Hour); only the
# first row of each hour is counted, so every hour contributes exactly once.
# drop_duplicates does this in a single pass instead of re-filtering the
# whole frame once per unique hour.
hourly_first = weather.drop_duplicates(subset='date', keep='first')

# Start every known condition at 0 so that a condition which never appears as
# the first observation of an hour is still present in the dictionary.
# Missing (NaN) conditions are left out: NaN is not a reliable dictionary key.
condic = {cond: 0 for cond in weather.Conditions.dropna().unique()}

# value_counts() ignores NaN by default; int() keeps plain Python integers.
hours_per_condition = hourly_first.Conditions.value_counts().to_dict()
for cond, n_hours in hours_per_condition.items():
    condic[cond] = int(n_hours)
condic
    
    Out[3]:
In [4]:
    
# Count the number of collisions recorded under each weather condition.
#
# value_counts() counts rows per condition and skips missing (NaN) conditions.
# Note: DataFrame.size is rows x columns, so using it here would overstate
# every count by the number of columns in `incidents`; a row count is what is
# wanted.
conditionCount = {
    cond: int(n_collisions)
    for cond, n_collisions in incidents.Conditions.value_counts().to_dict().items()
}

conditionCount
    
    Out[4]:
In [8]:
    
# Collision rate per weather condition: the collision figure for the condition
# divided by the number of measured hours with that condition.
ratios = {}
for cond, n_collisions in conditionCount.items():
    n_hours = condic.get(cond, 0)
    if not n_hours:
        # No measured hour for this condition: the rate is undefined, so the
        # condition is skipped instead of raising ZeroDivisionError/KeyError.
        continue
    # float() keeps this a true division even without `from __future__ import division`.
    ratios[cond] = float(n_collisions) / n_hours

# Normalize on "Mostly Cloudy" (described in this notebook as the most common
# weather condition), so that it scores exactly 100.
reference = ratios["Mostly Cloudy"]
ratios = {cond: (rate / reference) * 100 for cond, rate in ratios.items()}
ratios
    
    Out[8]:
In [9]:
    
# Horizontal bar chart of the normalized ratios, sorted in ascending order.
sorted_ratios = pd.Series(ratios, name="Collision Frequency (Normalized)").sort_values()
df = sorted_ratios.to_frame()
df.plot.barh(figsize=(8, 8))
    
    Out[9]:
    
In [7]:
    
# Write the ratios to a JSON file for the d3 visualization, highest ratio first.
from collections import OrderedDict
import json

# Sort (condition, ratio) pairs by ratio, descending; OrderedDict preserves
# that order when the mapping is serialized.
ranked_items = sorted(ratios.items(), key=lambda item: item[1], reverse=True)
with open('datasets/freq_weather2.json', 'w') as out_file:
    json.dump(OrderedDict(ranked_items), out_file)
    
In [ ]: